library(readr)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(ggplot2)
library(purrr)
library(ggbeeswarm)
library(scales)
Attaching package: ‘scales’
The following object is masked from ‘package:purrr’:
discard
The following object is masked from ‘package:readr’:
col_factor
library(stringr)
library(jsonlite)
Attaching package: ‘jsonlite’
The following object is masked from ‘package:purrr’:
flatten
library(forcats)
library(ggbreak)
ggbreak v0.1.3 Learn more at https://yulab-smu.top/
If you use ggbreak in published research, please cite the following paper:
S Xu, M Chen, T Feng, L Zhan, L Zhou, G Yu. Use ggbreak to effectively utilize plotting space to deal with large datasets and
outliers. Frontiers in Genetics. 2021, 12:774846. doi: 10.3389/fgene.2021.774846
library(cowplot)
library(tidyr)
library(hexbin)
# Map antiSMASH classes to their categories
bgc_class <- read_tsv("./data/2025-01-16-1256-bgc_class_ref.tsv") %>% select(!owner_id)
Rows: 87 Columns: 5── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): class_name, full_name, bgc_category
dbl (2): bgc_class_id, owner_id
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
class_to_cat <- bgc_class$bgc_category
names(class_to_cat) <- bgc_class$class_name
# NCBI Taxonomy data for Cyanobacteriota assemblies
cyano_asm_tax <- read_tsv("data/cyano_asm_tax.tsv")
Rows: 6108 Columns: 11── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (9): assembly_accession, tax_name, superkingdom, phylum, class, order, family, genus, species
dbl (1): tax_id
lgl (1): kingdomError: no more error handlers available (recursive errors?); invoking 'abort' restart
Warning in (function (bibtype, textVersion, header = NULL, footer = NULL, :
type 29 is unimplemented in 'type2char'
Error in (function (bibtype, textVersion, header = NULL, footer = NULL, :
INTEGER() can only be applied to a 'integer', not a 'unknown type #29'
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# bgc_tax <- read_tsv("./data/2025-01-16-1522-cyano_bgc_tax.tsv")
# From SMC: All antiSMASH BGC calls for Cyanobacteriota genomes
# Need to clean a bit and add categories
regions <- read_tsv("./data/2025-02-26-1456-cyano_as_regions.tsv")
Rows: 32112 Columns: 21── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (10): contig_name, region_class, accession_id, data_source_description, tax_phylum, tax_class, tax_order, tax_family, tax_genus, tax_species
dbl (9): region_gene_id, bgc_id, region_length, region_start_nt, region_end_nt, bgc_annotation_id, smc_id, size_bp, n_scaffolds
lgl (2): region_category, contig_edge
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Table of NCBI assemblies at "chromosome" or "complete" quality levels
ncbi_hiq_meta <- read_tsv("data/ncbi_cyano_HiQualityGenomes_metadata.tsv")
Rows: 971 Columns: 25── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (11): Assembly Accession, Assembly Name, Organism Name, ANI Check status, Organism Infraspecific Names Strain, Organism Infraspecific Names...
dbl (9): Organism Taxonomic ID, Assembly Stats Total Sequence Length, Assembly Stats Total Number of Chromosomes, Assembly Stats Contig N50, A...
lgl (4): Organism Infraspecific Names Breed, Organism Infraspecific Names Cultivar, Organism Infraspecific Names Ecotype, Organism Infraspecif...
date (1): Assembly Release Date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert 'class' to vector, add in 'category' as vector, order levels of classes and categories
regions <- regions %>%
mutate(classes = map(region_class, function(class_string) (
if ( str_starts(class_string, fixed("[")) )
fromJSON(class_string)
else
c(class_string)
))) %>%
mutate(categories = classes %>% map(function(cls_vec) (
map_vec(cls_vec, function(cls_str) class_to_cat[[cls_str]]) %>% unique() %>% sort())
)) %>%
mutate(cats_str = categories %>% map_chr(function(x) str_flatten(x, collapse = ", "))) %>%
add_count(cats_str) %>%
mutate(
cats_str = forcats::fct_reorder(cats_str, desc(n)),
class_str = classes %>% map_chr(function(x) str_flatten(x, collapse = ", "))
)
regions
How fragmented are the full set of genomes, and how does that impact BGC counts?
regions %>%
group_by(smc_id, n_scaffolds, contig_edge) %>%
summarize(n_bgcs = n()) %>%
ungroup() %>%
ggplot(aes(x = n_scaffolds, y = n_bgcs)) +
stat_bin_hex(bins = 50) +
scale_x_log10(breaks = breaks_log()) +
guides(fill = guide_colorbar(title = "# Genomes")) +
facet_wrap(. ~ contig_edge, ncol = 1, labeller = as_labeller(c("FALSE" = "BGCs not on contig edge", "TRUE" = "BGCs on contig edge"))) +
theme_bw() +
ggtitle('Fragmented genomes have inflated BGC counts', subtitle = 'Full dataset')
`summarise()` has grouped output by 'smc_id', 'n_scaffolds'. You can override using the `.groups` argument.
How does the proportion of BGCs off/on a contig edge change if we filter for high-quality genomes?
filters_df <- bind_rows(
regions %>%
group_by(contig_edge) %>%
summarize(filter = "Unfiltered", n = n()) %>%
ungroup() %>%
mutate(pct = 100 * n / sum(n)),
regions %>%
semi_join(ncbi_hiq_meta, by = join_by(accession_id == `Assembly Accession`)) %>%
group_by(contig_edge) %>%
summarize(filter = "Complete/Chromosome", n = n()) %>%
mutate(pct = 100 * n / sum(n)),
regions %>%
filter(n_scaffolds < 30) %>%
group_by(contig_edge) %>%
summarize(filter = "< 30 Scaffolds", n = n()) %>%
mutate(pct = 100 * n / sum(n)),
regions %>%
filter(n_scaffolds < 50) %>%
group_by(contig_edge) %>%
summarize(filter = "< 50 Scaffolds", n = n()) %>%
mutate(pct = 100 * n / sum(n)),
regions %>%
filter(n_scaffolds < 100) %>%
group_by(contig_edge) %>%
summarize(filter = "< 100 Scaffolds", n = n()) %>%
mutate(pct = 100 * n / sum(n)),
)
ggplot(filters_df, aes(x = filter, y = n)) +
geom_col(aes(fill = fct_rev(as_factor(contig_edge))), position = position_stack()) +
scale_x_discrete(name = "", limits = c("Unfiltered", "< 100 Scaffolds", "< 50 Scaffolds", "< 30 Scaffolds", "Complete/Chromosome")) +
scale_y_continuous(name = "Number of BGCs", labels = label_comma()) +
scale_fill_manual(name = "BGC on contig edge", values = c("gray80", "black")) +
theme_classic() +
coord_flip()
# filter off / on contig edge
# n_scaffolds < 30 5415 / 513 (91% off)
# n_scaffolds < 50 6320 / 1203 (84% off)
# n_scaffolds < 100 8176 / 3056 (73% off)
# only HiQ 2885 / 68 (98%)
regions <- regions %>%
semi_join(ncbi_hiq_meta, by = join_by(accession_id == `Assembly Accession`))
regions %>%
group_by(smc_id) %>%
mutate(n_bgcs = n()) %>%
select(tax_genus, n_bgcs, n_scaffolds) %>%
distinct() %>%
ggplot(aes(x = n_scaffolds, y = n_bgcs)) +
stat_bin_hex(bins = 50) +
# geom_point(alpha = 0.3, pch=21) +
# geom_beeswarm() +
scale_x_log10(breaks = breaks_log()) +
guides(fill = guide_colorbar(title = "# Genomes")) +
# scale_x_continuous(transform = transform_pseudo_log(base=10), breaks = breaks_log())
theme_bw()
Adding missing grouping variables: `smc_id`
Summary statistics of BGC length across BGC categories
region_summary <- regions %>%
group_by(cats_str) %>%
mutate(n = n()) %>%
group_by(cats_str, n) %>%
summarize_at(
.vars = vars(region_length),
.funs = list(
min_len = min,
max_len = max,
mean_len = mean,
median_len = median,
sd = sd
)
) %>%
arrange(desc(n))
region_summary
write_tsv(region_summary, "./region_summary.tsv")
# Purely informational plot -- Not needed for pub (since we have the table for it)
category_counts <- region_summary %>%
ggplot(aes(y = reorder(cats_str, desc(n)))) +
geom_col(aes(x = n)) +
scale_x_continuous(name = "BGC count", breaks = breaks_width(100)) +
scale_y_discrete(name = "BGC Category") +
theme_bw() +
ggtitle('Number of BGCs in dataset, divided by category')
category_counts
ggsave("./figs/svg/category_counts.svg", category_counts, device = "svg")
Saving 7.29 x 4.51 in image
ggsave("./figs/png/category_counts.png", category_counts, device = "png")
Saving 7.29 x 4.51 in image
Number of BGCs by category - lumping all hybrids except NRPS-PKS
# Lump any hybrid category with fewer than 80 BGCs into an "all other" category
region_summary_lumped <- region_summary %>%
mutate(
group = if_else(n < 80, "All other hybrids", cats_str),
group = group %>% fct_reorder(n)
)
# Keep a reference DF handy for which categories got lumped
lump_groups <- region_summary_lumped %>% select(cats_str, group)
# Use the MIBiG / antiSMASH coloring scheme
cat_colors <- c(
"Polyketide" = "#f4a460",
"NRP" = "#2e8b57",
"RiPP" = "#4169e1",
"Terpene" = "purple",
"Saccharide" = "#deb887",
"Other" = "#191970",
"NRP, Polyketide" = "lightsteelblue",
"All other hybrids" = "gray50"
)
# Plot it
lumped_category_counts <- region_summary_lumped %>%
ggplot(aes(y = reorder(group, n))) +
geom_col(aes(x = n, fill = group)) +
scale_x_continuous(name = "BGC count", breaks = breaks_extended()) +
scale_y_discrete(name = "BGC Category") +
scale_fill_manual(values = cat_colors) +
theme_bw() +
guides(fill = "none")
lumped_category_counts
ggsave("./figs/svg/category_counts_lumped.svg", lumped_category_counts, device = "svg")
Saving 7.29 x 4.51 in image
ggsave("./figs/png/category_counts_lumped.png", lumped_category_counts, device = "png")
Saving 7.29 x 4.51 in image
Distribution of BGC lengths by category (or combination of categories)
my_breaks <- c(1, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000)
regions_lumped <- regions %>% left_join(lump_groups, by = 'cats_str')
region_hist <- ggplot(regions_lumped, aes(
x = region_length / 1000,
# y = categories
)) +
geom_histogram(aes(fill=group), bins=50) +
scale_x_log10(name = "BGC length (kb)", guide = "axis_logticks", breaks = breaks_log(), labels = label_comma()) +
scale_y_continuous(name = "BGC count", breaks = breaks_extended(), labels = label_comma()) +
scale_fill_manual(values = cat_colors) +
facet_grid(rows = vars(cats_str), scales = "free_y") +
theme_bw() +
theme(strip.text.y.right = element_text(angle = 0)) +
guides(fill = FALSE)
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as of ggplot2 3.3.4.
region_hist
ggsave("./figs/svg/region_hist.svg", region_hist, device = "svg")
Saving 10 x 10 in image
ggsave("./figs/png/region_hist.png", region_hist, device = "png")
Saving 10 x 10 in image
Same, but lump hybrid categories together, except for NRPS-PKS
region_hist_lumped <- regions_lumped %>%
filter(group != "All other hybrids") %>%
ggplot(aes(x = region_length / 1000)) +
geom_histogram(aes(fill=group), bins=50) +
scale_x_log10(name = "BGC length (kb)", guide = "axis_logticks", limits = c(1, NA), breaks = c(1, 5, 10, 50, 100, 200)) +
scale_y_continuous(name = "BGC count", breaks = breaks_extended(n = 3)) +
scale_fill_manual(values = cat_colors) +
facet_wrap(vars(group), ncol = 1, scales = "free_y") +
guides(fill = guide_legend(title = "BGC Category")) +
theme_bw() +
theme(strip.background = element_blank(),
strip.text = element_blank())
region_hist_lumped
ggsave("./figs/svg/region_hist_lumped.svg", region_hist_lumped, device = "svg")
Saving 7.29 x 4.51 in image
ggsave("./figs/png/region_hist_lumped.png", region_hist_lumped, device = "png")
Saving 7.29 x 4.51 in image
Total BGCs divided by tax_genus, colored by BGC category
tax_count <- regions_lumped %>%
group_by(tax_genus, cats_str) %>%
summarize(num_bgcs = n()) %>%
mutate(tax_genus = tax_genus %>% fct_reorder(num_bgcs)) %>%
left_join(lump_groups, by = "cats_str") #%>%
`summarise()` has grouped output by 'tax_genus'. You can override using the `.groups` argument.
# group_by(tax_genus) %>%
# summarize(tot_bgcs = sum(num_bgcs))
tax_count
p_all <- tax_count %>%
group_by(tax_genus) %>%
filter(sum(num_bgcs) > 0) %>%
ggplot(aes(x = fct_infreq(tax_genus, w = num_bgcs))) +
geom_col(aes(y = num_bgcs, fill = group), position = position_stack(reverse = TRUE)) +
scale_y_continuous(name = "BGC count", breaks = breaks_extended()) +
scale_x_discrete(name = "Genus") +
scale_fill_manual(name = "BGC Category", values = cat_colors) +
coord_flip() +
theme_bw() +
ggtitle("(all genera)")
p_all
ggsave("./figs/svg/genus_counts_all.svg", p_all, device = "svg")
Saving 7.29 x 4.51 in image
ggsave("./figs/png/genus_counts_all.png", p_all, device = "png")
Saving 7.29 x 4.51 in image
cyano_nohits <- read_tsv('data/ncbi_cyano_nohit_accs.txt', col_names=c('accession_id')) %>%
left_join(cyano_asm_tax, by = join_by(accession_id == assembly_accession))
Rows: 186 Columns: 1── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): accession_id
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cyano_nohit_genus_counts <- cyano_nohits %>%
group_by(genus) %>%
summarize(n_nohits = n()) %>%
mutate(genus = replace_na(genus, "Unclassified")) %>%
arrange(genus)
genomes_by_genus <- read_tsv('data/2025-02-25-1442-cyano_smc_src_counts_by_genus.tsv')
Rows: 172 Columns: 2── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): tax_genus
dbl (1): n_sources
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
genomes_by_genus <- genomes_by_genus %>%
left_join(cyano_nohit_genus_counts, by = join_by(tax_genus == genus)) %>%
mutate(n_nohits = replace_na(n_nohits, 0)) %>%
mutate(tot_genomes = n_nohits + n_sources, .keep = "unused")
BGCs per genome (labeled with total BGCs) with a side panel for genome count
df_plots <- tax_count %>%
left_join(genomes_by_genus, by = 'tax_genus') %>%
mutate(bgc_dens = num_bgcs/tot_genomes)
p_bgc_dens <- df_plots %>%
ggplot(aes(x = fct_infreq(tax_genus, w=bgc_dens))) +
geom_col(aes(y = bgc_dens, fill = group), position = position_stack(reverse = TRUE)) +
geom_text(
aes(
y = tot_bgc_dens,
x = tax_genus,
label = format(tot_bgcs, big.mark = ',', trim = TRUE)
),
data = df_plots %>%
group_by(tax_genus) %>%
mutate(tot_bgc_dens = sum(bgc_dens), tot_bgcs = sum(num_bgcs)) %>%
select(tax_genus, tot_bgc_dens, tot_bgcs) %>%
distinct(),
position = position_dodge(0.9),
hjust = -0.2
) +
scale_y_continuous(name = "BGCs per genome", breaks = seq(0,55,5), limits = c(0,55)) +
scale_x_discrete(name = "Genus") +
scale_fill_manual(name = "BGC Category", values = cat_colors) +
coord_flip() +
theme_bw() +
theme(legend.position = "bottom")
# p_bgc_dens
genome_counts <- df_plots %>%
group_by(tax_genus, tot_genomes) %>%
summarize(
tot_bgc_dens = sum(bgc_dens),
tot_bgcs = sum(num_bgcs)
) %>%
arrange(tot_bgc_dens)
genome_counts
p_genome_ct <- genome_counts %>%
ggplot(aes(x = fct_infreq(tax_genus, w=tot_bgc_dens))) +
geom_col(aes(y = tot_genomes)) +
scale_y_continuous(trans = transform_pseudo_log(base=10), breaks = c(0, 1, 5, 10, 20, 50, 100, 200, 500, 1000)) +
coord_flip() +
theme_bw() +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank()
)
# p_genome_ct
p1 <- plot_grid(p_bgc_dens, p_genome_ct, align = "h", rel_widths = c(2,1))
p1
ggsave("./figs/png/split.png", p1, width = 8, height = 11, device = "png")
Same thing but filter for genera with at least 5 genomes
df_plots <- tax_count %>%
left_join(genomes_by_genus, by = 'tax_genus') %>%
mutate(bgc_dens = num_bgcs/tot_genomes) %>%
filter(tot_genomes >= 5)
p_bgc_dens <- df_plots %>%
ggplot(aes(x = fct_infreq(tax_genus, w=bgc_dens))) +
geom_col(aes(y = bgc_dens, fill = group), position = position_stack(reverse = TRUE)) +
geom_text(
aes(y = tot_bgc_dens, x = tax_genus, label = format(tot_bgcs, big.mark = ',', trim = TRUE)),
data = df_plots %>% group_by(tax_genus) %>% mutate(tot_bgc_dens = sum(bgc_dens), tot_bgcs = sum(num_bgcs)) %>% select(tax_genus, tot_bgc_dens, tot_bgcs) %>% distinct(),
position = position_dodge(0.9),
hjust = -0.2
) +
scale_y_continuous(name = "BGCs per genome", breaks = seq(0,25,5), limits = c(0,20)) +
scale_x_discrete(name = "Genus") +
scale_fill_manual(name = "BGC Category", values = cat_colors) +
coord_flip() +
theme_bw() +
theme(legend.position = "bottom") +
ggtitle('>= 5 genomes in genus')
# p_bgc_dens
genome_counts <- df_plots %>%
group_by(tax_genus, tot_genomes) %>%
summarize(tot_bgc_dens = sum(bgc_dens), tot_bgcs = sum(num_bgcs)) %>%
arrange(tot_bgc_dens)
genome_counts
p_genome_ct <- genome_counts %>%
ggplot(aes(x = fct_infreq(tax_genus, w=tot_bgc_dens))) +
geom_col(aes(y = tot_genomes)) +
scale_y_continuous(trans=scales::transform_pseudo_log(base=10), breaks = c(0, 1, 5, 10, 20, 50, 100, 200, 500, 1000)) +
coord_flip() +
theme_bw() +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank()
)
# p_genome_ct
p2 <- plot_grid(p_bgc_dens, p_genome_ct, align = "h", rel_widths = c(2,1))
p2
ggsave("./figs/png/split_5genomes.png", p2, width = 8, height = 11, device = "png")
Same but instead filter for genera with at least 50 BGCs
df_plots <- tax_count %>%
left_join(genomes_by_genus, by = 'tax_genus') %>%
mutate(bgc_dens = num_bgcs/tot_genomes) %>%
group_by(tax_genus) %>% mutate(tot_bgc_dens = sum(bgc_dens), tot_bgcs = sum(num_bgcs)) %>%
filter(tot_bgcs >= 50)
p_bgc_dens <- df_plots %>%
ggplot(aes(x = fct_infreq(tax_genus, w=bgc_dens))) +
geom_col(aes(y = bgc_dens, fill = group), position = position_stack(reverse = TRUE)) +
geom_text(
aes(y = tot_bgc_dens, x = tax_genus, label = format(tot_bgcs, big.mark = ',', trim = TRUE)),
data = df_plots %>% select(tax_genus, tot_bgc_dens, tot_bgcs) %>% distinct(),
position = position_dodge(0.9),
hjust = -0.2
) +
scale_y_continuous(name = "BGCs per genome", breaks = seq(0,25,5), limits = c(0,20)) +
scale_x_discrete(name = "Genus") +
scale_fill_manual(name = "BGC Category", values = cat_colors) +
coord_flip() +
theme_bw() +
theme(legend.position = "bottom") +
ggtitle('>= 50 BGCs in genus')
# p_bgc_dens
genome_counts <- df_plots %>%
group_by(tax_genus, tot_genomes) %>%
summarize(tot_bgc_dens = sum(bgc_dens), tot_bgcs = sum(num_bgcs)) %>%
arrange(tot_bgc_dens)
genome_counts
p_genome_ct <- genome_counts %>%
ggplot(aes(x = fct_infreq(tax_genus, w=tot_bgc_dens))) +
geom_col(aes(y = tot_genomes)) +
scale_y_continuous(trans=scales::transform_pseudo_log(base=10), breaks = c(0, 1, 5, 10, 20, 50, 100, 200, 500, 1000)) +
coord_flip() +
theme_bw() +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank()
)
# p_genome_ct
p3 <- plot_grid(p_bgc_dens, p_genome_ct, align = "h", rel_widths = c(2,1))
p3
ggsave("./figs/png/split_50bgcs.png", p3, width = 8, height = 11, device = "png")
Plot the BGCs per genome (normalized to 100%) alongside number of BGCs per genus and genomes per genus
df_plots <- tax_count %>%
left_join(genomes_by_genus, by = 'tax_genus') %>%
mutate(bgc_dens = num_bgcs/tot_genomes) %>%
group_by(tax_genus) %>% mutate(tot_bgc_dens = sum(bgc_dens), tot_bgcs = sum(num_bgcs))
p_bgc_dens <- df_plots %>%
ggplot(aes(x = fct_rev(tax_genus))) +
geom_col(aes(y = bgc_dens, fill = group), position = position_fill(reverse = TRUE)) +
scale_y_continuous(name = "BGC proportion") +
scale_x_discrete(name = "Genus") +
scale_fill_manual(name = "BGC Category", values = cat_colors) +
coord_flip() +
theme_bw() +
theme(legend.position = "bottom")
# p_bgc_dens
p_bgc_ct <- df_plots %>%
ggplot(aes(x = fct_rev(tax_genus))) +
geom_col(aes(y = tot_bgcs), data = df_plots %>% select(tax_genus, tot_bgcs, tot_bgc_dens) %>% distinct()) +
scale_y_continuous(
name = "BGC count",
trans = transform_pseudo_log(base = 10),
breaks = c(0, 1, 5, 10, 20, 50, 100, 200, 500)
) +
coord_flip() +
theme_bw() +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank()
)
genome_counts <- df_plots %>%
group_by(tax_genus, tot_genomes) %>%
summarize(tot_bgc_dens = sum(bgc_dens), tot_bgcs = sum(num_bgcs)) %>%
arrange(tot_bgc_dens)
`summarise()` has grouped output by 'tax_genus'. You can override using the `.groups` argument.
genome_counts
p_genome_ct <- genome_counts %>%
ggplot(aes(x = fct_rev(tax_genus))) +
geom_col(aes(y = tot_genomes)) +
scale_y_continuous(
name = "Genome count",
trans=transform_pseudo_log(base = 10),
breaks = c(0, 1, 5, 10, 20, 50, 100, 200, 500, 1000)
) +
coord_flip() +
theme_bw() +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank()
)
# p_genome_ct
p4 <- plot_grid(p_bgc_dens, p_bgc_ct, p_genome_ct, align = "h", rel_widths = c(3,1,1), nrow=1)
p4
ggsave("./figs/png/split_proportional.png", p4, width = 8, height = 11, device = "png")
Plot the BGCs per genome (not normalized to 100%) alongside number of BGCs per genus and genomes per genus
df_plots <- tax_count %>%
left_join(genomes_by_genus, by = 'tax_genus') %>%
mutate(bgc_dens = num_bgcs/tot_genomes) %>%
group_by(tax_genus) %>%
mutate(tot_bgc_dens = sum(bgc_dens), tot_bgcs = sum(num_bgcs))
p_bgc_dens <- df_plots %>%
ggplot(aes(x = fct_reorder(tax_genus, tot_bgc_dens, .desc=T))) +
geom_col(aes(y = bgc_dens, fill = group), position = position_stack(reverse = TRUE)) +
scale_y_continuous(name = "BGCs per genome") +
scale_x_discrete(name = "Genus") +
scale_fill_manual(name = "BGC Category", values = cat_colors) +
coord_flip() +
guides(fill = guide_legend(position = "inside")) +
theme_bw() +
theme(legend.justification.inside = c(0.99,0.99))
# p_bgc_dens
p_bgc_ct <- df_plots %>%
ggplot(aes(x = fct_reorder(tax_genus, tot_bgc_dens, .desc=T))) +
geom_col(aes(y = tot_bgcs), data = df_plots %>% select(tax_genus, tot_bgcs, tot_bgc_dens) %>% distinct()) +
scale_y_continuous(
name = "BGC count",
trans = transform_pseudo_log(base = 10),
breaks = c(0, 1, 5, 10, 20, 50, 100, 200, 500)
) +
coord_flip() +
theme_bw() +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank()
)
genome_counts <- df_plots %>%
group_by(tax_genus, tot_genomes) %>%
summarize(tot_bgc_dens = sum(bgc_dens), tot_bgcs = sum(num_bgcs)) %>%
arrange(tot_bgc_dens)
`summarise()` has grouped output by 'tax_genus'. You can override using the `.groups` argument.
genome_counts
p_genome_ct <- genome_counts %>%
ggplot(aes(x = fct_reorder(tax_genus, tot_bgc_dens, .desc=T))) +
geom_col(aes(y = tot_genomes)) +
scale_y_continuous(name = "Genome count", trans=scales::transform_pseudo_log(base=10), breaks = c(0, 1, 5, 10, 20, 50, 100, 200, 500, 1000)) +
coord_flip() +
theme_bw() +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank()
)
# p_genome_ct
p5 <- plot_grid(p_bgc_dens, p_bgc_ct, p_genome_ct, align = "h", rel_widths = c(3,1,1), nrow=1)
df_plots
p5
ggsave("./figs/png/split_triple.png", p4, width = 8, height = 11, device = "png")
p_a <- p5 + ggtitle("")
p_b <- lumped_category_counts
p_c <- region_hist_lumped +
guides(fill = "none")
bottom_row <- plot_grid(p_b, p_c, nrow = 1, labels = c("B", "C"), label_size = 12)
Warning: Removed 6 rows containing missing values or values outside the scale range (`geom_bar()`).
fig2 <- plot_grid(p_a, bottom_row, nrow = 2, labels = c("A", ""), label_size = 12, rel_heights = c(2,1))
fig2
ggsave("./figs/_fig2.png", fig2, width = 7, height = 7, device = "png")
ggsave("./figs/_fig2.pdf", fig2, width = 7, height = 7, device = "pdf")